################################################################################
## Group Identities and Parliamentary Debates: Replication package
## Fiva, Nedregård and Øien (2025)

# Description:

## This file takes as input the top 50 divergent words in Norwegian
## and uses Google (googletrans Version: 4.0.0-rc.1) to translate them
## to English.
## Words with strange Google translations are manually translated.

#######################################################################

# %%
## Import packages
from googletrans import Translator # Version: 4.0.0-rc.1
import pandas as pd
import os

# %%
## Set directories

# Get the directory of the current Python script
# Absolute location of this script; its parent folder anchors all data paths
_script_path = os.path.abspath(__file__)
current_script_directory = os.path.dirname(_script_path)

# The model output folder sits three levels above the script, under data/
_model_output_parts = ('..', '..', '..', 'data', '3_model_output')
d_dir = os.path.join(current_script_directory, *_model_output_parts)



### Manual corrections to typos and bad translations (these are applied to all 5 characteristics)
# %%

## Manual corrections to typographical errors (these should be applied to all the characteristics)


# Maps a word as it appears in the input files to the spelling that should be
# used instead. Several entries only restore the capitalisation of a proper
# noun or acronym; the others replace the word with a different spelling.
# Every section of this script applies the mapping to its word columns
# before translating them.
norwegian_corrections = {
    'kute': 'kutte',
    'trøndelag': 'Trøndelag',
    'nordland': 'Nordland',
    'skattekute': 'skattekutt',
    'amerika': 'Amerika',
    'norge': 'Norge',
    'li': 'lide',
    'nrk': 'NRK',
    'finnmark': 'Finnmark',
    'svær': 'svært',
    'østfold': 'Østfold',
    'nato': 'NATO',
    'drei': 'dreie',
    'drammen': 'Drammen',
    'vestfold': 'Vestfold',
    'groruddal': 'Groruddalen',
    'russland': 'Russland',
    'afghanistand': 'Afghanistan',
    'bergen': 'Bergen',
    'israel': 'Israel',
    'husbank': 'Husbanken',
    'akershus': 'Akershus',
    'hedmark': 'Hedmark',
    'nsb': 'NSB',
    'svalbard': 'Svalbard',
    'statoil': 'Statoil',
    'nr': 'nummer',
}

# Manual English translations that override the Google translation.
#
# Keys are compared against the Norwegian word columns AFTER
# `norwegian_corrections` has been applied to them, so a key must use the
# corrected spelling (e.g. 'NRK', 'Drammen', 'Husbanken'), not the raw one.
#
# NOTE: the variable name keeps its original spelling ("tranlation") because
# the rest of the script refers to it under this name.
tranlation_corrections = {
    'enkelt': 'simple',
    'borgerlig': 'bourgeois',
    'kommune': 'municipality',
    'skattelette': 'tax relief',
    # Was keyed on 'skattekute', which can never match: that spelling is
    # rewritten to 'skattekutt' by norwegian_corrections before this lookup.
    'skattekutt': 'tax cut',
    'bra': 'good',
    'fall': 'fall',
    'satsing': 'initiative',
    'slag': 'champion',
    'atomvåpen': 'nuclear weapon',
    'kr': 'Norwegian kroner',
    'barnehageplass': 'kindergarten spot',
    'sal': 'chamber',
    'Østfold': 'Østfold',
    'fylkeskommune': 'regional government',
    'pst': 'percent',
    'norden': 'Nordics',
    'sørge': 'ensure',
    'lur': 'clever',
    'renne': 'run',
    'Drammen': 'Drammen',
    'helsevesen': "Health service",
    'forsvarsminister': 'Defense Minister',
    'mill': 'million',
    'læreplan': 'curriculum',
    'Husbanken': 'The Housing Bank',
    # NOTE(review): this key is capitalised while 'forsvarsminister' is not,
    # and no Norwegian correction produces 'Justisminister' -- confirm that
    # the input really contains this capitalised form.
    'Justisminister': 'Justice Minister',
    'transportplan': 'transport plan',
    'folk': 'people',
    # NOTE(review): 'Europarådet' is usually rendered 'Council of Europe';
    # 'The European Council' is a different (EU) body -- confirm intent.
    'europaråd': 'The European Council',
    'barn': 'children',
    'jordbruk': 'farming',
    'altså': 'that is',
    'NRK': 'Norwegian Broadcasting',
}



################# BLOC ####################################
# %%

## Reading and translating bloc divergent words

df = pd.read_csv(os.path.join(d_dir, 'di_words_top50_bloc.csv'))

# The two Norwegian word columns of the bloc file
word_columns = ['words_H', 'words_0']

# Apply the shared spelling corrections to each word column. This happens
# before translation, so the translator receives the corrected words.
for word_col in word_columns:
    for original_word, corrected_word in norwegian_corrections.items():
        needs_fix = df[word_col] == original_word
        df.loc[needs_fix, word_col] = corrected_word



# %%
# Initialize the translator
translator = Translator()

# Translate specific columns
def translate_column(column):
    """Return `column` with every entry translated from Norwegian to English.

    Each entry is passed on its own to ``translator.translate`` with
    ``src='no'`` and ``dest='en'``; the ``.text`` of the result is kept.
    """
    def to_english(word):
        return translator.translate(word, src='no', dest='en').text

    return column.apply(to_english)

# Place each translated column directly to the right of its source column
for word_col in word_columns:
    insert_at = df.columns.get_loc(word_col) + 1
    df.insert(insert_at, word_col + '_translated', translate_column(df[word_col]))


# %%

## Manual translation for strange Google translations

# Wherever the Norwegian word has a manual translation, overwrite the
# machine translation in the matching "_translated" column.
for word_col in word_columns:
    for norwegian_word, manual_english in tranlation_corrections.items():
        has_manual = df[word_col] == norwegian_word
        df.loc[has_manual, word_col + '_translated'] = manual_english

# %%
# save file

df.to_csv(os.path.join(d_dir, 'di_trans_words_top50_bloc.csv'))


################# Gender ####################################
# %%

## Reading and translating gender divergent words

df = pd.read_csv(os.path.join(d_dir, 'di_words_top50_gender.csv'))

# The two Norwegian word columns of the gender file
word_columns = ['words_female', 'words_0']


# %%

## Manual corrections to typographical errors

# Apply the shared spelling corrections to each word column. This happens
# before translation, so the translator receives the corrected words.
for word_col in word_columns:
    for original_word, corrected_word in norwegian_corrections.items():
        needs_fix = df[word_col] == original_word
        df.loc[needs_fix, word_col] = corrected_word

# %%
# Initialize the translator
translator = Translator()

# Translate specific columns
def translate_column(column):
    """Return `column` with every entry translated from Norwegian to English.

    Each entry is passed on its own to ``translator.translate`` with
    ``src='no'`` and ``dest='en'``; the ``.text`` of the result is kept.
    """
    def to_english(word):
        return translator.translate(word, src='no', dest='en').text

    return column.apply(to_english)


# Place each translated column directly to the right of its source column
for word_col in word_columns:
    insert_at = df.columns.get_loc(word_col) + 1
    df.insert(insert_at, word_col + '_translated', translate_column(df[word_col]))



# %%
## Manual translation for strange Google translations

# Wherever the Norwegian word has a manual translation, overwrite the
# machine translation in the matching "_translated" column.
for word_col in word_columns:
    for norwegian_word, manual_english in tranlation_corrections.items():
        has_manual = df[word_col] == norwegian_word
        df.loc[has_manual, word_col + '_translated'] = manual_english

# %%
# save file

df.to_csv(os.path.join(d_dir, 'di_trans_words_top50_gender.csv'))


###################### Age ################################################


# %%

## Reading and translating age divergent words

df = pd.read_csv(os.path.join(d_dir, 'di_words_top50_age.csv'))

# The two Norwegian word columns of the age file
word_columns = ['words_old', 'words_0']

# %%

## Manual corrections to typographical errors

# Apply the shared spelling corrections to each word column. This happens
# before translation, so the translator receives the corrected words.
for word_col in word_columns:
    for original_word, corrected_word in norwegian_corrections.items():
        needs_fix = df[word_col] == original_word
        df.loc[needs_fix, word_col] = corrected_word

# %%
# Initialize the translator
translator = Translator()

# Translate specific columns
def translate_column(column):
    """Return `column` with every entry translated from Norwegian to English.

    Each entry is passed on its own to ``translator.translate`` with
    ``src='no'`` and ``dest='en'``; the ``.text`` of the result is kept.
    """
    def to_english(word):
        return translator.translate(word, src='no', dest='en').text

    return column.apply(to_english)


# Place each translated column directly to the right of its source column
for word_col in word_columns:
    insert_at = df.columns.get_loc(word_col) + 1
    df.insert(insert_at, word_col + '_translated', translate_column(df[word_col]))



# %%
## Manual translation for strange Google translations

# 'difficult' -- should this be 'very'? It comes from 'svær', which I guess
# is the lemma for 'svært'.

# Wherever the Norwegian word has a manual translation, overwrite the
# machine translation in the matching "_translated" column.
for word_col in word_columns:
    for norwegian_word, manual_english in tranlation_corrections.items():
        has_manual = df[word_col] == norwegian_word
        df.loc[has_manual, word_col + '_translated'] = manual_english

# %%
# save file

df.to_csv(os.path.join(d_dir, 'di_trans_words_top50_age.csv'))

###################### Urban ################################################


# %%

## Reading and translating urban divergent words

df = pd.read_csv(os.path.join(d_dir, 'di_words_top50_urban.csv'))

# The two Norwegian word columns of the urban file
word_columns = ['words_urban', 'words_0']

# %%

## Manual corrections to typographical errors

# Apply the shared spelling corrections to each word column. This happens
# before translation, so the translator receives the corrected words.
for word_col in word_columns:
    for original_word, corrected_word in norwegian_corrections.items():
        needs_fix = df[word_col] == original_word
        df.loc[needs_fix, word_col] = corrected_word

# %%
# Initialize the translator
translator = Translator()

# Translate specific columns
def translate_column(column):
    """Return `column` with every entry translated from Norwegian to English.

    Each entry is passed on its own to ``translator.translate`` with
    ``src='no'`` and ``dest='en'``; the ``.text`` of the result is kept.
    """
    def to_english(word):
        return translator.translate(word, src='no', dest='en').text

    return column.apply(to_english)


# Place each translated column directly to the right of its source column
for word_col in word_columns:
    insert_at = df.columns.get_loc(word_col) + 1
    df.insert(insert_at, word_col + '_translated', translate_column(df[word_col]))



# %%
## Manual translation for strange Google translations

# Wherever the Norwegian word has a manual translation, overwrite the
# machine translation in the matching "_translated" column.
for word_col in word_columns:
    for norwegian_word, manual_english in tranlation_corrections.items():
        has_manual = df[word_col] == norwegian_word
        df.loc[has_manual, word_col + '_translated'] = manual_english

# %%
# save file

df.to_csv(os.path.join(d_dir, 'di_trans_words_top50_urban.csv'))

###################### Background ################################################


# %%

## Reading and translating background divergent words

df = pd.read_csv(os.path.join(d_dir, 'di_words_top50_background.csv'))

# The two Norwegian word columns of the background file
word_columns = ['words_white', 'words_0']

# %%

# Apply the shared spelling corrections to each word column. This happens
# before translation, so the translator receives the corrected words.
for word_col in word_columns:
    for original_word, corrected_word in norwegian_corrections.items():
        needs_fix = df[word_col] == original_word
        df.loc[needs_fix, word_col] = corrected_word

# %%
# Initialize the translator
translator = Translator()

# Translate specific columns
def translate_column(column):
    """Return `column` with every entry translated from Norwegian to English.

    Each entry is passed on its own to ``translator.translate`` with
    ``src='no'`` and ``dest='en'``; the ``.text`` of the result is kept.
    """
    def to_english(word):
        return translator.translate(word, src='no', dest='en').text

    return column.apply(to_english)


# Place each translated column directly to the right of its source column
for word_col in word_columns:
    insert_at = df.columns.get_loc(word_col) + 1
    df.insert(insert_at, word_col + '_translated', translate_column(df[word_col]))



# %%
## Manual translation for strange Google translations

# Wherever the Norwegian word has a manual translation, overwrite the
# machine translation in the matching "_translated" column.
for word_col in word_columns:
    for norwegian_word, manual_english in tranlation_corrections.items():
        has_manual = df[word_col] == norwegian_word
        df.loc[has_manual, word_col + '_translated'] = manual_english

# %%
# save file

df.to_csv(os.path.join(d_dir, 'di_trans_words_top50_background.csv'))




# %%
